### NOTES:
#	An "arrival" type is an observed network type that can be obtained
#	by adding a 1-author or 2-author project to some observed type.
#	A "departure" type is an observed network type that can become
#	an arrival type by adding a 1-author or 2-author project.
###



### SELECT TYPES FOR RESIDUAL CATEGORIES

# A) Selection Parameters

# Size cutoffs for types sent to the residual category (maximum number of
# nodes in a type). The value depends on whether the 50% sample (samp) or
# the population is being used:

resid_max <- if (samp) 25 else 50  ### NOTE: 50 for population, 25 for 50% sample
deprt_max <- if (samp) 10 else 20  ### NOTE: 20 for population, 10 for 50% sample
arriv_max <- if (samp) 10 else 20  ### NOTE: 20 for population, 10 for 50% sample

# Share of the small non-arrival types that is sent to the residual category:

r1_frac <- 1     ### NOTE: 1.00 used for paper

# Share of the small arrival types that is sent to the residual category:

r2_frac <- 0.97  ### NOTE: 0.97 used for paper

###

# B) Identify and Sample Types for Residual Categories

# i) Candidate types: every type whose count is within the overall cutoff.

temp <- which(counts <= resid_max)

# Flags and counts for the candidates (all aligned with temp):

is_arr1 <- temp %in% Add_1auth[, "tb"]   # observed arrival type, one-auth paper
is_arr2 <- temp %in% Add_2auth[, "tb"]   # observed arrival type, two-auth paper
is_dep1 <- temp %in% Add_1auth[, "t"]    # observed departure type, one-auth paper
n_cand  <- counts[temp]

# Non-arrival candidates: not an observed arrival type for either kind of
# paper, and not a one-auth departure type above the departure cutoff.

temp1 <- temp[!(is_arr1 | is_arr2 | (is_dep1 & n_cand > deprt_max))]

# ii) Departure and arrival types for one-auth papers, below their cutoffs:

temp2d <- temp[is_dep1 & n_cand <= deprt_max]
temp2a <- temp[is_arr1 & n_cand <= arriv_max]

# Keep only the arrival types reached from a departure type kept just above:

temp2a <- temp2a[temp2a %in% Add_1auth[Add_1auth[, "t"] %in% temp2d, "tb"]]

# iii) Arrival types for (only) two-auth papers, below the arrival cutoff:

temp2x <- temp[is_arr2 & !is_arr1 & n_cand <= arriv_max]

# iv) Final lists

# Combine list of arrival types:

temp2 <- unique(c(temp2a, temp2x))

# Drop the helper objects introduced in this step:

rm(is_arr1, is_arr2, is_dep1, n_cand)

# Remove isolated types (just in case any are rare):
# Every candidate whose index is at or below the last isolated type
# (X1 == 0) is dropped.
# NOTE(review): this relies on isolated types being listed first in `types`
# -- confirm where `types` is built.
# The cutoff index is computed once and falls back to 0 when there are no
# isolated types, so nothing is dropped in that case and max() is never
# called on an empty vector (which warns and returns -Inf).

last_isolated <- max(c(0L, which(types[,"X1"] == 0)))

temp1 <- temp1[temp1 > last_isolated]
temp2 <- temp2[temp2 > last_isolated]

rm(last_isolated)

# v) Randomly select the types to send to residual categories 
# (weight by 1/N, and l_t / L for small arrival types)
#
# Positions are drawn with sample.int() and then used to index the candidate
# vector. sample(x, ...) treats a length-one numeric x as "sample from 1:x",
# so with a single remaining candidate it would draw from the wrong set (or
# fail because `prob` has length one). For two or more candidates this is
# what sample() does internally, so the draws for the seed are unchanged.

set.seed(3554622)

# Number of types to draw from each list:

n_send1 <- round(length(temp1) * r1_frac)
n_send2 <- round(length(temp2) * r2_frac)

# Sampling weights:

w_send1 <- pmin(1, 1 / counts[temp1])
w_send2 <- pmin(1, 1 / counts[temp2]) * (l_t[temp2] / L)

# Draws (non-arrival types first, so the random number stream is used in the
# same order as before):

resid_send1 <- temp1[sample.int(length(temp1), n_send1, prob = w_send1)]
resid_send2 <- temp2[sample.int(length(temp2), n_send2, prob = w_send2)]

resid_types <- sort(c(resid_send1, resid_send2) )

rm(n_send1, n_send2, w_send1, w_send2)

###

# C) Categorize Residual Types

# i) Identify residual types with specific features
#
# Each flag below has one entry per element of resid_types and is TRUE when
# at least one project of that type meets the stated condition.
# Rows are extracted with drop = FALSE so that a single residual type still
# gives a one-row matrix (rowSums() fails on a plain vector).
# NOTE(review): assumes N_t, skl_t and gen_t have the same dimensions, with
# one row per type -- confirm where they are built.

res_N   <- N_t[resid_types, , drop = FALSE]
res_skl <- skl_t[resid_types, , drop = FALSE]
res_gen <- gen_t[resid_types, , drop = FALSE]
res_l   <- l_t[resid_types]

# TRUE for rows with any project satisfying project_mask, among types with
# more than `more_than` projects (res_l is recycled down the rows).

any_project <- function(project_mask, more_than) {
	rowSums(project_mask & res_l > more_than, na.rm = TRUE) > 0
}

# 1-author paper, 3 projects:

temp3 <- any_project(res_N == 1, 2)

# 3-author paper, skill differences and generalists, 2 or 3 projects:

temp4 <- any_project(res_N == 3 & res_skl == 0 & res_gen == 0, 1)
temp5 <- any_project(res_N == 3 & res_skl == 1 & res_gen == 0, 1)
temp6 <- any_project(res_N == 3 & res_skl == 1 & res_gen == 1, 1)

# 4-author paper, skill differences and generalists, 2 or 3 projects:

temp7 <- any_project(res_N == 4 & res_skl == 0 & res_gen == 0, 1)
temp8 <- any_project(res_N == 4 & res_skl == 1 & res_gen == 0, 1)
temp9 <- any_project(res_N == 4 & res_skl == 1 & res_gen == 1, 1)

rm(res_N, res_skl, res_gen, res_l, any_project)

# ii) Make separate lists of purely residual types and combined categories

# Every residual type receives exactly one label. Labels are written from the
# lowest-priority feature to the highest, so a later assignment overrides an
# earlier one; a type with none of the features stays purely residual ("R1").
# (attributes at end of lines = [N]umber of authors, [S]kill differences, [G]eneralist on team)

resid_cat <- rep("R1", length(resid_types))

resid_cat[temp7] <- "R2g"	# N=4, S=0, G=0
resid_cat[temp4] <- "R2d"	# N=3, S=0, G=0
resid_cat[temp3] <- "R2a"	# N=1, S=0, G=0
resid_cat[temp8] <- "R2e"	# N=4, S=1, G=0
resid_cat[temp9] <- "R2f"	# N=4, S=1, G=1
resid_cat[temp5] <- "R2b"	# N=3, S=1, G=0
resid_cat[temp6] <- "R2c"	# N=3, S=1, G=1

# Purely residual types:

R1_types <- resid_types[resid_cat == "R1"]

# Combined categories:

R2a_types <- resid_types[resid_cat == "R2a"]

R2b_types <- resid_types[resid_cat == "R2b"]
R2c_types <- resid_types[resid_cat == "R2c"]
R2d_types <- resid_types[resid_cat == "R2d"]

R2e_types <- resid_types[resid_cat == "R2e"]
R2f_types <- resid_types[resid_cat == "R2f"]
R2g_types <- resid_types[resid_cat == "R2g"]

# Check (the lists together should reproduce resid_types):

all(sort(c(R1_types, R2a_types, R2b_types, R2c_types, R2d_types, R2e_types, R2f_types, R2g_types) ) == resid_types) 

rm(resid_cat)



### REVISE LIST OF TYPES

# A) Tables of Residual Categories

# Purely residual types (one for each combination of indiv. char's and num. of projects):

typesR1 <- matrix(0, nlevels(Z)*L, ncol(types) )
colnames(typesR1) <- colnames(types)

# Researcher characteristics (Z): each level is repeated for 1, ..., L projects.

typesR1[,"Z"] <- rep(seq_len(nlevels(Z)), each = L)

# Number of projects represented by each row (1, ..., L within each Z level):

n_proj <- rep(seq_len(L), times = nlevels(Z))

# Fake project categories to match number of papers:
# (-1 marks a placeholder project: a row standing for l projects has -1 in
#  X1, ..., Xl and 0 in the remaining project columns, so the number of -1
#  entries in a row equals its number of projects)
# The pattern is derived from L instead of being written out for L = 3; for
# L = 3 it gives the same table as the hand-written columns did. Only the
# four project columns X1-X4 exist, so L > 4 is not representable here.

proj_cols <- paste0("X", 1:4)

typesR1[, proj_cols] <- ifelse(outer(n_proj, seq_along(proj_cols), ">="), -1, 0)

# Rows for categories with one project (to be omitted below):

temp <- which(n_proj == 1)

# Make tables for combined categories of types (all rows with two or more
# projects). drop = FALSE keeps a matrix even if a single row is selected,
# so nrow() keeps working on these tables.

typesR2b <- typesR1[n_proj != 1, , drop = FALSE]
typesR2c <- typesR2d <- typesR2e <- typesR2f <- typesR2g <- typesR2b

# First table of combined categories only has rows for the max num. of projects:

typesR2a <- typesR1[n_proj == L, , drop = FALSE]

rm(n_proj, proj_cols)

###

# B) Type Counts for Residual Categories

# Storage objects:

countsR1 <- rep(0, nrow(typesR1))

countsR2a <- rep(0, nrow(typesR2a) )
countsR2b <- countsR2c <- countsR2d <- countsR2e <- countsR2f <- countsR2g <- rep(0, nrow(typesR2b) )

# Purely residual types:
# Counts are summed by (Z, number of projects). Both grouping factors carry
# the full set of levels used to build typesR1 (Z = 1, ..., nlevels(Z) and
# 1, ..., L projects), and empty cells are set to 0. Without this, a
# combination with no residual types gives NA, and a Z level or number of
# projects that is absent altogether shrinks the table so that countsR1 no
# longer lines up with the rows of typesR1.

temp <- tapply(
	counts[R1_types],
	list(factor(types[R1_types,"Z"], levels = seq_len(nlevels(Z))),
	     factor(l_t[R1_types], levels = seq_len(L))),
	sum, default = 0)

# Transpose so the vector runs over number of projects within each Z level
# (the row order of typesR1):

countsR1 <- c(t(temp) )

# A type whose Z or number of projects lies outside those levels would be
# left out of the table; stop rather than lose researchers silently.

if (!isTRUE(all.equal(sum(countsR1), sum(counts[R1_types])))) {
	stop("countsR1 does not add up to the counts of the purely residual types", call. = FALSE)
}

# Combined categories of types:
#
# For each combined category, the counts of its types are summed by
# (Z, number of projects) and written to the matching row of the category
# table; the number of projects of a table row is its number of -1 entries.
# Only combinations that actually occur are summed, so rows with no types
# stay at 0 and no NA can enter the counts (previously only R2f was protected
# against this, through "default=0").

# Arguments:
#	cat_tab: category table (typesR2a, ..., typesR2g)
#	members: indices (rows of types) of the types in that category
# Returns a vector of counts with one entry per row of cat_tab.
# Reads the script-level objects types, counts and l_t.

sum_into_categories <- function(cat_tab, members) {
	out <- rep(0, nrow(cat_tab))
	if (length(members) == 0) {
		return(out)
	}
	# "Z:projects" keys for the table rows and for the member types:
	cat_key <- paste(cat_tab[,"Z"], rowSums(cat_tab == -1), sep = ":")
	mem_key <- paste(types[members,"Z"], l_t[members], sep = ":")
	sums <- tapply(counts[members], mem_key, sum)
	rows <- match(names(sums), cat_key)
	# A type without a row in the table would drop out of the totals:
	if (anyNA(rows)) {
		stop("residual types with no matching category row (Z:projects): ",
			paste(names(sums)[is.na(rows)], collapse = ", "), call. = FALSE)
	}
	out[rows] <- sums
	out
}

countsR2a <- sum_into_categories(typesR2a, R2a_types)
countsR2b <- sum_into_categories(typesR2b, R2b_types)
countsR2c <- sum_into_categories(typesR2c, R2c_types)
countsR2d <- sum_into_categories(typesR2d, R2d_types)
countsR2e <- sum_into_categories(typesR2e, R2e_types)
countsR2f <- sum_into_categories(typesR2f, R2f_types)
countsR2g <- sum_into_categories(typesR2g, R2g_types)

rm(sum_into_categories)

# Check counts, look for any zeros:

countsR1

cbind(countsR2a, countsR2b, countsR2c, countsR2d, countsR2e, countsR2f, countsR2g)
### NOTES: 
#	R2a repeats above.
#	R2b and R2e have two 0s.
#	Category rows with no types are 0 in every table (originally only R2f
#	set "default=0", to avoid NA in one instance in sample).
###

###

# C) Compile Results

# i) Combine tables of types / categories

# Types to keep as is:
# (rows are selected by positive index: if resid_types were empty,
#  types[-resid_types,] would select zero rows instead of all of them)

keep <- setdiff(seq_len(nrow(types)), resid_types)

typesK <- types[keep, , drop = FALSE]
countsK <- counts[keep]

# Separate out isolated types (to put first):
# (drop = FALSE keeps each piece a matrix even when it has a single row)

is_isolated <- typesK[,"X1"] == 0
is_linked   <- typesK[,"X1"] > 0

typesK0 <- typesK[is_isolated, , drop = FALSE]
countsK0 <- countsK[is_isolated]

typesK1 <- typesK[is_linked, , drop = FALSE]
countsK1 <- countsK[is_linked]

rm(keep, is_isolated, is_linked)

# ii) Check 

# Counts (the first line printed should equal the two that follow):

sum(countsK0) + sum(countsK1) + sum(countsR1) + sum(countsR2a) + sum(countsR2b) + sum(countsR2c) + sum(countsR2d) + sum(countsR2e) + sum(countsR2f) + sum(countsR2g)
sum(counts); N_auth_exp

# Proportions of researchers in actual types vs residual categories:

sum(countsK0) / N_auth_exp
sum(countsK1) / N_auth_exp
sum(countsR1) / N_auth_exp
( sum(countsR2a) + sum(countsR2b) + sum(countsR2c) + sum(countsR2d) + sum(countsR2e) + sum(countsR2f) + sum(countsR2g) ) / N_auth_exp

# iii) Replace master table of types and vector of counts

# Types (the previous table is held in temp1 until the clean-up step).
# New row order: isolated types, residual categories, remaining kept types.

temp1 <- types
types <- rbind(typesK0,typesR1,typesR2a,typesR2b,typesR2c,typesR2d,typesR2e,typesR2f,typesR2g,typesK1)

colnames(types) <- c("Z","X1","X2","X3","X4")

# Counts (previous vector held in temp2; same order as the rows of types):

temp2 <- counts
counts <- c(countsK0,countsR1,countsR2a,countsR2b,countsR2c,countsR2d,countsR2e,countsR2f,countsR2g,countsK1)

# Observed type shares:

obs_shares <- counts / N_auth_exp

# Check:

sum(counts); N_auth_exp
sum(obs_shares)

# Remove any residual categories with zero individuals:

sum(counts == 0)

# A missing count is treated as an empty category: an NA in a logical row
# index would otherwise insert rows of NA into types.

temp <- !is.na(counts) & counts > 0

# drop = FALSE keeps types a matrix (so nrow() works) even if one row is left.

types <- types[temp, , drop = FALSE]

counts <- counts[temp]

obs_shares <- obs_shares[temp]

# Number of types:

N_types <- nrow(types)



### CLEAN UP

# Remove the scratch objects (temp, temp1, temp2, temp2a/d/x, temp3-temp9):

rm(list = c(
	"temp", "temp1", "temp2",
	paste0("temp2", c("a", "d", "x")),
	paste0("temp", 3:9)
))
